# coding=utf8
import requests
import gevent
from gevent import monkey
import re

from model import SinaBlog

# Replace the standard socket implementation with gevent's cooperative one so
# that blocking network calls yield to other greenlets instead of stalling them.
# NOTE(review): gevent recommends patching as early as possible; `requests` is
# already imported above this call - confirm the ordering is intentional.
monkey.patch_socket()
from scrapy import Selector
import mongoengine

# Download the Sina blog front page. The timeout bounds the wait so a stalled
# connection cannot hang the whole script before any work has started.
menu = requests.get('http://blog.sina.com.cn/', timeout=10).content
# Decode as GBK, silently dropping any bytes that do not decode.
text = menu.decode('gbk', 'ignore')
# Collect the distinct article links found in the page: a 21-character path
# segment under /s/ followed by ".html?tj=1". A set removes repeated links.
urls = set(re.findall(r'http\://blog\.sina\.com\.cn/s/.{21}\.html\?tj\=1', text))



def download_requests(url, timeout=10):
    """Fetch one blog article, extract its fields and save it as a SinaBlog.

    The page is downloaded, decoded as UTF-8 and parsed with a scrapy
    ``Selector``. The article text, tags and title are copied onto a new
    ``SinaBlog`` instance, which is then saved via ``blog.save()``.

    This is a best-effort worker: any failure (network error, HTTP error
    status, decoding error, missing title, save error) is reported on stdout
    together with the offending URL and is not re-raised, so one bad page
    does not abort the other greenlets.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            bound, a single stalled connection would block its greenlet
            forever and the final join would never return.
    """
    try:
        blog = SinaBlog()
        response = requests.get(url, timeout=timeout)
        # Do not try to parse an error page as if it were an article.
        response.raise_for_status()
        content = response.content.decode('utf8')
        sel = Selector(text=content)

        # Check the title explicitly so a page without one produces a
        # meaningful message rather than "list index out of range".
        titles = sel.css('.articalTitle h2::text').extract()
        if not titles:
            raise ValueError('article title not found')

        blog.url = url
        # Concatenate every text node found below the article body container.
        blog.text = ''.join(sel.xpath('//*[@id="sina_keyword_ad_area2"]//*/text()').extract())
        blog.tags = sel.xpath('//*[@id="sina_keyword_ad_area"]/table/tr/td[1]/h3/a/text()').extract()
        blog.title = titles[0]
        blog.save()
        print("success", url)
    except Exception as e:
        # Include the URL so the failing page can be identified and retried.
        print('error:', e, url)

from gevent.pool import Pool

# Run at most five downloads concurrently; spawn one greenlet per collected
# article link and keep the handles so they can all be awaited below.
pool = Pool(5)
jobs = [
    pool.spawn(download_requests, url)
    for url in urls
]
# Block until every download greenlet has finished.
gevent.joinall(jobs)




